import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
!pip install catboost
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
print('Environment: Google Colab')
sys.path.append("/Users/poudel/Dropbox/a00_Resources/hyperband")
try:
from search import HyperbandSearchCV
print('File found: search.py')
except:
print('File not found: search.py')
try:
from hyperband_search import HyperbandSearchCV
print('File found: hyperband_search.py')
except:
print('File not found: hyperband_search.py')
File not found: search.py File found: hyperband_search.py
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
# modelling
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
# boosting
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
seaborn 0.11.0 xgboost 1.2.0 pandas 1.1.4 plotly_express 0.4.1 autopep8 1.5.2 numpy 1.19.4 imblearn 0.7.0 lightgbm 2.3.1 catboost 0.23.2 json 2.0.9 joblib 0.17.0
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
if show_plots:
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
def get_profit(y_true, y_pred):
tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
profit = 400*tp - 200*fn - 100*fp
return profit
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
| 5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
| 5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)
df_Xtrain.head(2)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
| 1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['tenure', 'MonthlyCharges', 'TotalCharges']
cols_cat = list(df_train.select_dtypes('object').columns)
# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']
print(cols_cat)
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
cols_num = ['TotalCharges','tenure', 'MonthlyCharges']
cols_num_old = cols_num
cols_cat_old = cols_cat
def combine_two_features(dfx,A,B):
dfx = dfx.copy()
assert len(A) == len(B)
for a,b in zip(A,B):
dfx[a+'_'+b] = dfx[a] + '_' + dfx[b]
return dfx
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
'TechSupport','PaymentMethod']
cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]
cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)
df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)
['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']
def create_groupby_features(dfx,cat,num,agg):
dfx = dfx.copy()
for c in cat:
for n in num:
for a in agg:
name = f"{c}_{n}_{a}"
dfx[name] = df_train.groupby(c)[n].transform(a)
return dfx
# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']
cols_num_new = [f'{c}_{n}_{a}'
for c in cols_grpcat
for n in cols_grpnum
for a in cols_grpagg]
cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)
df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)
['Contract_TotalCharges_mean']
df_Xtrain.head(2)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
| 1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No_No | No_No | No_No | No_Month-to-month | No_Yes | No_Bank transfer (automatic) | 1370.923131 |
cols_drop = ['gender']
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i)
for i in cols_cat]
# make sure no nans
df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_full, ser_ytrain_full,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_full)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (5634, 21) df_Xtrain : (4507, 25) ser_ytrain : (4507,) df_Xvalid : (1127, 25) ser_yvalid : (1127,) df_test : (1409, 21) ser_ytest : This does not exist.
| SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4555 | No | No | No | 16 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Credit card (automatic) | 19.75 | 294.90 | No_No | No_No | No_No | No_Month-to-month | No_No internet service | No_Credit card (automatic) | 1370.923131 |
| 3379 | No | Yes | No | 72 | No | No phone service | DSL | Yes | Yes | Yes | Yes | Yes | Yes | Two year | Yes | Electronic check | 64.70 | 4746.05 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Electronic check | 3683.643192 |
Regression Objectives:
MAE MAPE Poisson Quantile RMSE Huber Tweedie SMAPE R2 MSLE etc.
Classification Objectives:
Logloss CrossEntroy Precision Recall F1 BalancedAccuracy
Multiclassification objectives:
MultiClass MultiClassOneVsAll Precision Recall F1 TotalF1 MCC
Accuracy HingeLoss ZeroOneLoss Kappa WKappa AUC
#============================================================
catboost.CatBoostClassifier(
iterations = None, # n_estimators, num_trees, num_boost_round
learning_rate = None, # eta
depth = None, # max_depth
l2_leaf_reg = None, # reg_lambda
scale_pos_weight = None,
random_seed = None, # random_state
use_best_model = None,
verbose = None, # verbose_eval
silent = None,
logging_level = None, # silent verbose info debug
ignored_features = None,
cat_features = None, # indices or names
text_features = None,
one_hot_max_size = None,
objective = None, # loss_function
custom_loss = None,
custom_metric = None,
eval_metric = None,
score_function = None, # Cosine L2 NewtonCosine NewtonL2
subsample = None,
colsample_bylevel = None,
max_depth = None,
n_estimators = None,
num_boost_round = None,
num_trees = None,
early_stopping_rounds = None,
grow_policy = None,
classes_count = None,
class_weights = None, # list dict {0:1.0, 1:0.5}
set 1 for zero, then weight = sum_neg/sum_pos for class one.
Do not use this parameter with auto_class_weights and scale_pos_weight.
auto_class_weights = None,
class_names = None,
save_snapshot = None,
snapshot_file = None,
snapshot_interval = None
)
#===========================================================
from catboost.utils import eval_metric
from math import log
labels = [1, 0, 1]
probabilities = [0.4, 0.1, 0.9]
# In binary classification it is necessary to apply the logit function
# to the probabilities to get approxes.
logit = lambda x: log(x / (1 - x))
approxes = list(map(logit, probabilities))
accuracy = eval_metric(labels, approxes, 'Accuracy')
#======================================================
class LoglossMetric(object):
def get_final_error(self, error, weight):
return error / (weight + 1e-38)
def is_max_optimal(self):
return False
def evaluate(self, approxes, target, weight):
assert len(approxes) == 1
assert len(target) == len(approxes[0])
approx = approxes[0]
error_sum = 0.0
weight_sum = 0.0
for i in range(len(approx)):
e = np.exp(approx[i])
p = e / (1 + e)
w = 1.0 if weight is None else weight[i]
weight_sum += w
error_sum += -w * (target[i] * np.log(p) + (1 - target[i]) * np.log(1 - p))
return error_sum, weight_sum
model = CatBoostClassifier(eval_metric=LoglossMetric())
Catboost classifier fit
catboost.CatBoostClassifier.fit(X,y,
cat_features = None,
text_features = None,
sample_weight = None,
baseline = None,
use_best_model = None,
eval_set = None,
verbose = None,
logging_level = None,
plot = False,
column_description = None,
verbose_eval = None,
metric_period = None,
silent = None,
early_stopping_rounds = None,
save_snapshot = None,
snapshot_file = None,
snapshot_interval = None,
init_model = None,
from catboost import CatBoostClassifier
# CatBoostClassifier?
import catboost
show_methods(catboost)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | CatBoost | EFstrType | Pool | to_regressor |
| 1 | CatBoostClassifier | FeaturesData | core | train |
| 2 | CatBoostError | MetricVisualizer | cv | version |
| 3 | CatBoostRegressor | MultiRegressionCustomMetric | sum_models | widget |
| 4 | CatboostError | MultiRegressionCustomObjective | to_classifier |
show_methods(catboost.CatBoostClassifier)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | best_iteration_ | get_best_iteration | get_test_evals | random_seed_ |
| 1 | best_score_ | get_best_score | get_text_feature_indices | randomized_search |
| 2 | calc_feature_statistics | get_borders | get_tree_leaf_counts | save_borders |
| 3 | calc_leaf_indexes | get_cat_feature_indices | grid_search | save_model |
| 4 | classes_ | get_evals_result | is_fitted | score |
| 5 | compare | get_feature_importance | iterate_leaf_indexes | set_feature_names |
| 6 | copy | get_leaf_values | learning_rate_ | set_leaf_values |
| 7 | create_metric_calcer | get_leaf_weights | load_model | set_params |
| 8 | drop_unused_features | get_metadata | plot_partial_dependence | set_scale_and_bias |
| 9 | eval_metrics | get_object_importance | plot_predictions | shrink |
| 10 | evals_result_ | get_param | plot_tree | staged_predict |
| 11 | feature_importances_ | get_params | predict | staged_predict_log_proba |
| 12 | feature_names_ | get_scale_and_bias | predict_log_proba | staged_predict_proba |
| 13 | fit | get_test_eval | predict_proba | tree_count_ |
| 14 | get_all_params |
# catboost.CatBoostClassifier.fit?
catboost.CatBoostClassifier().fit
<bound method CatBoostClassifier.fit of <catboost.core.CatBoostClassifier object at 0x7fe03b76dc90>>
df_Xtrain.head()
| SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4555 | No | No | No | 16 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Credit card (automatic) | 19.75 | 294.90 | No_No | No_No | No_No | No_Month-to-month | No_No internet service | No_Credit card (automatic) | 1370.923131 |
| 3379 | No | Yes | No | 72 | No | No phone service | DSL | Yes | Yes | Yes | Yes | Yes | Yes | Two year | Yes | Electronic check | 64.70 | 4746.05 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Electronic check | 3683.643192 |
| 1713 | No | Yes | No | 67 | Yes | Yes | Fiber optic | No | Yes | Yes | Yes | Yes | Yes | One year | Yes | Credit card (automatic) | 109.70 | 7344.45 | Yes_No | No_No | No_Yes | No_One year | No_Yes | No_Credit card (automatic) | 3018.965636 |
| 2399 | Yes | Yes | No | 47 | Yes | Yes | Fiber optic | No | No | Yes | No | Yes | Yes | Month-to-month | No | Electronic check | 99.70 | 4747.20 | Yes_No | Yes_No | Yes_Yes | Yes_Month-to-month | Yes_No | Yes_Electronic check | 1370.923131 |
| 1096 | No | Yes | No | 46 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | Two year | No | Credit card (automatic) | 40.40 | 1842.70 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
print('cat_features: ', sorted(cols_cat_idx))
cat_features: [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23]
model = CatBoostClassifier(
n_estimators=1000,
random_state=SEED,
cat_features=cols_cat_idx,
scale_pos_weight=4
)
model.fit(df_Xtrain,ser_ytrain,plot=True,verbose=False,
eval_set=(df_Xvalid,ser_yvalid),
use_best_model=True,
early_stopping_rounds=50
)
<catboost.core.CatBoostClassifier at 0x7fe03d0f5450>
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('catboost',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f'profit = ${profit:,d}')
precision recall f1-score support
0 0.90 0.71 0.79 1035
1 0.49 0.78 0.60 374
accuracy 0.73 1409
macro avg 0.70 0.74 0.70 1409
weighted avg 0.79 0.73 0.74 1409
[[737 298]
[ 84 290]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| catboost | 0.7289 | 0.4932 | 0.7754 | 0.6029 | 0.7437 |
profit = $69,400
%%time
import scipy.stats as stats
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.optimize.linesearch import LineSearchWarning
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.simplefilter('ignore', category=LineSearchWarning)
# Define our model
params_fixed = dict(random_state= SEED)
resource = 'iterations'
params_hyp = {
'depth' : stats.randint(3,12),
'learning_rate' : stats.loguniform(0.01, 1.0),
resource : stats.randint(100, 1000),
'subsample' : [0.6, 0.7, 0.8, 0.9, 1.0],
'reg_lambda' : stats.loguniform(0.01, 1.0),
'scale_pos_weight' : [1,2,3,4,5]
}
model = CatBoostClassifier(**params_fixed)
# Perform Hyperparameter Tuning
cv = RepeatedStratifiedKFold(n_splits=5,
n_repeats=5,
random_state=SEED)
grid = HyperbandSearchCV(model,params_hyp,
resource_param = resource,
min_iter = 100,
max_iter = 500, # use 1k or 2k
cv = cv,
scoring = scoring, # roc_auc
refit = True,
verbose = 0,
random_state = SEED
)
# grid.fit(df_Xtrain, ytrain) # this does not work.
# print('Best parameters: ', grid.best_params_)
# params_best = grid.best_params_
# params = params_fixed
# params.update(params_best)
# print(params)
# Wall time:
CPU times: user 2.54 ms, sys: 30 µs, total: 2.57 ms Wall time: 2.61 ms
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING) # use INFO to see progress
from optuna.pruners import SuccessiveHalvingPruner
show_methods(optuna)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Any | delete_study | load_study | structs |
| 1 | Study | distributions | logging | study |
| 2 | TYPE_CHECKING | exceptions | multi_objective | trial |
| 3 | Trial | get_all_study_summaries | progress_bar | type_checking |
| 4 | TrialPruned | importance | pruners | types |
| 5 | create_study | importlib | samplers | version |
| 6 | create_trial | integration | storages | visualization |
| 7 | dashboard |
def objective(trial):
params_cat_optuna = {
'n_estimators': trial.suggest_int('n_estimators', 100,2000),
'learning_rate': trial.suggest_loguniform('learning_rate', 0.01,1.0),
'max_depth': trial.suggest_int('max_depth', 3, 12),
'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [2,3,4,5]),
'reg_lambda': trial.suggest_uniform('reg_lambda', 0.01, 1),
'subsample': trial.suggest_uniform('reg_lambda', 0.6, 1),
'used_ram_limit': '3gb'
}
# fit the model
model = CatBoostClassifier(random_state=SEED,
cat_features=cols_cat_idx,
**params_cat_optuna)
model.fit(df_Xtrain, ser_ytrain,
eval_set=[(df_Xvalid, ser_yvalid)],
use_best_model=True,
verbose=0,
early_stopping_rounds=100)
ypreds = model.predict(df_Xvalid)
ypreds = np.rint(ypreds)
#score = skmetrics.roc_auc_score(ser_yvalid.to_numpy().ravel(),ypreds)
score = get_profit(ser_yvalid.to_numpy().ravel(),ypreds)
return score
# NOTE: there is inherent non-determinism in optuna hyperparameter selection
# we may not get the same hyperparameters when run twice.
params_optuna_study = dict(
direction='maximize',
sampler=optuna.samplers.TPESampler(seed=SEED),
study_name='catboost_optuna',
storage='sqlite:///catboost_optuna_churn.db',
load_if_exists=True,
pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=100)
)
study = optuna.create_study(**params_optuna_study)
N_TRIALS = 1 # make it large
study.optimize(objective, n_trials=N_TRIALS,timeout=600)
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/optuna/trial/_trial.py:742: RuntimeWarning:
Inconsistent parameter values for distribution with name "reg_lambda"! This might be a configuration mistake. Optuna allows to call the same distribution with the same name more then once in a trial. When the parameter values are inconsistent optuna only uses the values of the first call and ignores all following. Using these values: {'low': 0.01, 'high': 1}
%%time
# Resume from last time
N_TRIALS = 10 # make it large
study = optuna.create_study(**params_optuna_study)
study.optimize(objective, n_trials=N_TRIALS,timeout=600)
CPU times: user 1min 21s, sys: 5.73 s, total: 1min 27s Wall time: 31.3 s
print(f'Number of finished trials: {len(study.trials)}')
# best trail
best_trial = study.best_trial
# best params
params_best = study.best_trial.params
params_best
Number of finished trials: 87
{'learning_rate': 0.03702333216928129,
'max_depth': 6,
'n_estimators': 376,
'reg_lambda': 0.6177177549044198,
'scale_pos_weight': 5}
# params_best = {'learning_rate': 0.03702333216928129,
# 'max_depth': 6,
# 'n_estimators': 376,
# 'reg_lambda': 0.6177177549044198,
# 'scale_pos_weight': 5}
model = CatBoostClassifier(**params_best,cat_features=cols_cat_idx,
verbose=False,random_state=SEED)
model.fit(df_Xtrain_full,ytrain_full)
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f"profit = {profit:,d}")
model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=True)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))